Untitled

Reading data from website https://www.worldometers.info/coronavirus/

setwd("C:/R_DS")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## Warning: package 'readr' was built under R version 4.0.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(dplyr)
#reading url using read_html
corona_rul <- read_html("https://www.worldometers.info/coronavirus/")

#reading table from the website 
corona_file <- html_table(corona_rul)

View(corona_file)
corona_file_new <-   corona_file[[1]]
#View(corona_file_new)
str(corona_file_new)
## 'data.frame':    237 obs. of  19 variables:
##  $ #                 : int  NA NA NA NA NA NA NA NA 1 2 ...
##  $ Country,Other     : chr  "North America" "South America" "Asia" "Europe" ...
##  $ TotalCases        : chr  "36,190,724" "21,586,450" "29,439,551" "40,500,869" ...
##  $ NewCases          : chr  "+8,613" "+467" "+104,476" "+60,062" ...
##  $ TotalDeaths       : chr  "824,771" "565,299" "435,292" "927,775" ...
##  $ NewDeaths         : chr  "+210" "+8" "+877" "+1,387" ...
##  $ TotalRecovered    : chr  "27,898,884" "19,239,136" "26,705,986" "28,622,164" ...
##  $ NewRecovered      : chr  "+4,273" "+450" "+66,729" "+82,655" ...
##  $ ActiveCases       : chr  "7,467,069" "1,782,015" "2,298,273" "10,950,930" ...
##  $ Serious,Critical  : chr  "15,433" "21,392" "26,450" "31,600" ...
##  $ Tot Cases/1M pop  : chr  "" "" "" "" ...
##  $ Deaths/1M pop     : chr  "" "" "" "" ...
##  $ TotalTests        : chr  "" "" "" "" ...
##  $ Tests/1M pop      : chr  "" "" "" "" ...
##  $ Population        : chr  "" "" "" "" ...
##  $ Continent         : chr  "North America" "South America" "Asia" "Europe" ...
##  $ 1 Caseevery X ppl : chr  "" "" "" "" ...
##  $ 1 Deathevery X ppl: chr  "" "" "" "" ...
##  $ 1 Testevery X ppl : chr  "" "" "" "" ...
#writing the data to csv file
write.table(corona_file_new, file = "corona_file.csv",
            sep = ",",
            row.names = FALSE)
corona_file_new <- read.csv("C:/R_DS/corona_file.csv")
corona_data <- data.frame(corona_file_new[-1:-8,]) 
head(corona_data)
##    X. Country.Other TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## 9   1           USA 31,425,966   +5,635     568,834       +57     23,946,970
## 10  2        Brazil 12,984,956              331,530               11,357,521
## 11  3         India 12,625,146  +37,226     165,293      +161     11,698,657
## 12  4        France  4,822,470               96,678                  299,624
## 13  5        Russia  4,589,540   +8,646     100,717      +343      4,211,133
## 14  6            UK  4,362,150   +2,762     126,862       +26      3,912,562
##    NewRecovered ActiveCases Serious.Critical Tot.Cases.1M.pop Deaths.1M.pop
## 9          +267   6,910,162            8,716           94,522         1,711
## 10                1,295,905            8,318           60,761         1,551
## 11      +18,699     761,196            8,944            9,081           119
## 12                4,426,168            5,341           73,757         1,479
## 13       +7,052     277,690            2,300           31,439           690
## 14      +10,920     322,726              517           64,002         1,861
##     TotalTests Tests.1M.pop    Population     Continent X1.Caseevery.X.ppl
## 9  409,404,894    1,231,390   332,473,823 North America                 11
## 10  28,600,000      133,830   213,704,094 South America                 16
## 11 249,019,657      179,116 1,390,271,710          Asia                110
## 12  66,728,544    1,020,577    65,383,138        Europe                 14
## 13 121,900,000      835,035   145,981,988        Europe                 32
## 14 127,546,869    1,871,386    68,156,365        Europe                 16
##    X1.Deathevery.X.ppl X1.Testevery.X.ppl
## 9                  584                  1
## 10                 645                  7
## 11               8,411                  6
## 12                 676                  1
## 13               1,449                  1
## 14                 537                  1
#View(corona_data)
corona_data <- data.frame(corona_file_new[ , -1 ]) 
 corona_data <- data.frame(corona_file_new[c(-1:-8,-228 :-236),  ]) 

 #View(corona_data)
#chaning colnmaes by using rename

corona_data_updated <- corona_data  %>% 
  rename(S.No. ="X."  , Country_Other = "Country.Other", 
    Serious_Critical  = "Serious.Critical", 
         Tot_Cases_1M_pop = "Tot.Cases.1M.pop", 
         Deaths_1M_pop = "Deaths.1M.pop", 
         Tests_1M_pop  ="Tests.1M.pop" ,
        X1_Caseevery_X_ppl   = "X1.Caseevery.X.ppl", 
         X1_Deathevery_X_ppl  = "X1.Deathevery.X.ppl",
        X1_Testevery_X_ppl  = "X1.Testevery.X.ppl", )


head(corona_data_updated)
##    S.No. Country_Other TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## 9      1           USA 31,425,966   +5,635     568,834       +57     23,946,970
## 10     2        Brazil 12,984,956              331,530               11,357,521
## 11     3         India 12,625,146  +37,226     165,293      +161     11,698,657
## 12     4        France  4,822,470               96,678                  299,624
## 13     5        Russia  4,589,540   +8,646     100,717      +343      4,211,133
## 14     6            UK  4,362,150   +2,762     126,862       +26      3,912,562
##    NewRecovered ActiveCases Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop
## 9          +267   6,910,162            8,716           94,522         1,711
## 10                1,295,905            8,318           60,761         1,551
## 11      +18,699     761,196            8,944            9,081           119
## 12                4,426,168            5,341           73,757         1,479
## 13       +7,052     277,690            2,300           31,439           690
## 14      +10,920     322,726              517           64,002         1,861
##     TotalTests Tests_1M_pop    Population     Continent X1_Caseevery_X_ppl
## 9  409,404,894    1,231,390   332,473,823 North America                 11
## 10  28,600,000      133,830   213,704,094 South America                 16
## 11 249,019,657      179,116 1,390,271,710          Asia                110
## 12  66,728,544    1,020,577    65,383,138        Europe                 14
## 13 121,900,000      835,035   145,981,988        Europe                 32
## 14 127,546,869    1,871,386    68,156,365        Europe                 16
##    X1_Deathevery_X_ppl X1_Testevery_X_ppl
## 9                  584                  1
## 10                 645                  7
## 11               8,411                  6
## 12                 676                  1
## 13               1,449                  1
## 14                 537                  1
#View(corona_data_updated)
#corona_data %>% select(-NewCases,-NewDeaths,-NewRecovered )
#View(corona_data_updated)
#2nd method starts_with() to remove colums

corona_data_updated <-  corona_data_updated %>% dplyr:: select(-starts_with("New"))


head(corona_data_updated)
##    S.No. Country_Other TotalCases TotalDeaths TotalRecovered ActiveCases
## 9      1           USA 31,425,966     568,834     23,946,970   6,910,162
## 10     2        Brazil 12,984,956     331,530     11,357,521   1,295,905
## 11     3         India 12,625,146     165,293     11,698,657     761,196
## 12     4        France  4,822,470      96,678        299,624   4,426,168
## 13     5        Russia  4,589,540     100,717      4,211,133     277,690
## 14     6            UK  4,362,150     126,862      3,912,562     322,726
##    Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop  TotalTests Tests_1M_pop
## 9             8,716           94,522         1,711 409,404,894    1,231,390
## 10            8,318           60,761         1,551  28,600,000      133,830
## 11            8,944            9,081           119 249,019,657      179,116
## 12            5,341           73,757         1,479  66,728,544    1,020,577
## 13            2,300           31,439           690 121,900,000      835,035
## 14              517           64,002         1,861 127,546,869    1,871,386
##       Population     Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl
## 9    332,473,823 North America                 11                 584
## 10   213,704,094 South America                 16                 645
## 11 1,390,271,710          Asia                110               8,411
## 12    65,383,138        Europe                 14                 676
## 13   145,981,988        Europe                 32               1,449
## 14    68,156,365        Europe                 16                 537
##    X1_Testevery_X_ppl
## 9                   1
## 10                  7
## 11                  6
## 12                  1
## 13                  1
## 14                  1
#View(corona_data_updated)
library(dplyr)
#removing commas in the data
set.seed(1)
 mysub <- function(x) {
   gsub(",","",x)
 }

#APPLYING mysub function to all applicable columns in the dataset
corona_data_updated[,3:12:15] <- apply(corona_data_updated[,3:12:15],MARGIN=2, FUN= mysub )
## Warning in 3:12:15: numerical expression has 10 elements: only the first used

## Warning in 3:12:15: numerical expression has 10 elements: only the first used
View(corona_data_updated)
str(corona_data_updated)
## 'data.frame':    220 obs. of  16 variables:
##  $ S.No.              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country_Other      : chr  "USA" "Brazil" "India" "France" ...
##  $ TotalCases         : chr  "31425966" "12984956" "12625146" "4822470" ...
##  $ TotalDeaths        : chr  "568834" "331530" "165293" "96678" ...
##  $ TotalRecovered     : chr  "23946970" "11357521" "11698657" "299624" ...
##  $ ActiveCases        : chr  "6910162" "1295905" "761196" "4426168" ...
##  $ Serious_Critical   : chr  "8716" "8318" "8944" "5341" ...
##  $ Tot_Cases_1M_pop   : chr  "94522" "60761" "9081" "73757" ...
##  $ Deaths_1M_pop      : chr  "1711" "1551" "119" "1479" ...
##  $ TotalTests         : chr  "409404894" "28600000" "249019657" "66728544" ...
##  $ Tests_1M_pop       : chr  "1231390" "133830" "179116" "1020577" ...
##  $ Population         : chr  "332473823" "213704094" "1390271710" "65383138" ...
##  $ Continent          : chr  "North America" "South America" "Asia" "Europe" ...
##  $ X1_Caseevery_X_ppl : chr  "11" "16" "110" "14" ...
##  $ X1_Deathevery_X_ppl: chr  "584" "645" "8411" "676" ...
##  $ X1_Testevery_X_ppl : chr  "1" "7" "6" "1" ...
#converting chr type to num  using lapply
colms <-  c(3:12)
corona_data_updated[colms]  <- lapply(corona_data_updated[colms], as.numeric) 
## Warning in lapply(corona_data_updated[colms], as.numeric): NAs introduced by
## coercion

## Warning in lapply(corona_data_updated[colms], as.numeric): NAs introduced by
## coercion
corona_data_updated[14:16]  <- lapply(corona_data_updated[14:16], as.numeric) 
## Warning in lapply(corona_data_updated[14:16], as.numeric): NAs introduced by
## coercion
str(corona_data_updated)
## 'data.frame':    220 obs. of  16 variables:
##  $ S.No.              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Country_Other      : chr  "USA" "Brazil" "India" "France" ...
##  $ TotalCases         : num  31425966 12984956 12625146 4822470 4589540 ...
##  $ TotalDeaths        : num  568834 331530 165293 96678 100717 ...
##  $ TotalRecovered     : num  23946970 11357521 11698657 299624 4211133 ...
##  $ ActiveCases        : num  6910162 1295905 761196 4426168 277690 ...
##  $ Serious_Critical   : num  8716 8318 8944 5341 2300 ...
##  $ Tot_Cases_1M_pop   : num  94522 60761 9081 73757 31439 ...
##  $ Deaths_1M_pop      : num  1711 1551 119 1479 690 ...
##  $ TotalTests         : num  4.09e+08 2.86e+07 2.49e+08 6.67e+07 1.22e+08 ...
##  $ Tests_1M_pop       : num  1231390 133830 179116 1020577 835035 ...
##  $ Population         : num  3.32e+08 2.14e+08 1.39e+09 6.54e+07 1.46e+08 ...
##  $ Continent          : chr  "North America" "South America" "Asia" "Europe" ...
##  $ X1_Caseevery_X_ppl : num  11 16 110 14 32 16 16 24 14 29 ...
##  $ X1_Deathevery_X_ppl: num  584 645 8411 676 1449 ...
##  $ X1_Testevery_X_ppl : num  1 7 6 1 1 1 1 2 1 2 ...
summary(corona_data_updated)
##      S.No.       Country_Other        TotalCases         TotalDeaths     
##  Min.   :  1.0   Length:220         Min.   :        3   Min.   :      1  
##  1st Qu.: 55.5   Class :character   1st Qu.:     4286   1st Qu.:     93  
##  Median :110.0   Mode  :character   Median :    38008   Median :    748  
##  Mean   :110.0                      Mean   :  1200352   Mean   :  27963  
##  3rd Qu.:164.5                      3rd Qu.:   262992   3rd Qu.:   6083  
##  Max.   :219.0                      Max.   :132083861   Max.   :2868477  
##  NA's   :1                                              NA's   :15       
##  TotalRecovered       ActiveCases       Serious_Critical  Tot_Cases_1M_pop
##  Min.   :        1   Min.   :       0   Min.   :    1.0   Min.   :     7  
##  1st Qu.:     3024   1st Qu.:     255   1st Qu.:   13.0   1st Qu.:  1724  
##  Median :    27074   Median :    2952   Median :   74.0   Median : 14607  
##  Mean   :   975276   Mean   :  200203   Mean   : 1360.6   Mean   : 28138  
##  3rd Qu.:   222100   3rd Qu.:   26488   3rd Qu.:  522.5   3rd Qu.: 49536  
##  Max.   :106359121   Max.   :22856263   Max.   :97962.0   Max.   :158818  
##  NA's   :2           NA's   :2          NA's   :76        NA's   :2       
##  Deaths_1M_pop      TotalTests         Tests_1M_pop       Population       
##  Min.   :   0.3   Min.   :      470   Min.   :    688   Min.   :8.030e+02  
##  1st Qu.:  33.5   1st Qu.:   110172   1st Qu.:  47114   1st Qu.:6.560e+05  
##  Median : 215.0   Median :   829634   Median : 232236   Median :6.605e+06  
##  Mean   : 498.5   Mean   :  8605639   Mean   : 542121   Mean   :2.939e+07  
##  3rd Qu.: 769.5   3rd Qu.:  4051128   3rd Qu.: 606038   3rd Qu.:2.385e+07  
##  Max.   :2791.0   Max.   :409404894   Max.   :6392809   Max.   :1.390e+09  
##  NA's   :17       NA's   :14          NA's   :14        NA's   :3          
##   Continent         X1_Caseevery_X_ppl X1_Deathevery_X_ppl X1_Testevery_X_ppl
##  Length:220         Min.   :     6     Min.   :    358     Min.   :  0.00    
##  Class :character   1st Qu.:    20     1st Qu.:   1286     1st Qu.:  2.00    
##  Mode  :character   Median :    70     Median :   4724     Median :  4.00    
##                     Mean   :  3376     Mean   :  90261     Mean   : 25.93    
##                     3rd Qu.:   581     3rd Qu.:  30098     3rd Qu.: 21.00    
##                     Max.   :150108     Max.   :2905295     Max.   :665.00    
##                     NA's   :3          NA's   :18          NA's   :15
library(tidyverse)
corona_data_updated <- na.omit(corona_data_updated)
summary(corona_data_updated)
##      S.No.        Country_Other        TotalCases        TotalDeaths    
##  Min.   :  1.00   Length:139         Min.   :     420   Min.   :     4  
##  1st Qu.: 39.50   Class :character   1st Qu.:   17899   1st Qu.:   175  
##  Median : 83.00   Mode  :character   Median :  117757   Median :  1662  
##  Mean   : 85.48                      Mean   :  897895   Mean   : 19713  
##  3rd Qu.:126.50                      3rd Qu.:  482413   3rd Qu.:  9340  
##  Max.   :198.00                      Max.   :31425966   Max.   :568834  
##  TotalRecovered      ActiveCases      Serious_Critical Tot_Cases_1M_pop
##  Min.   :      44   Min.   :      8   Min.   :   1.0   Min.   :   203  
##  1st Qu.:   13459   1st Qu.:   1302   1st Qu.:  13.0   1st Qu.:  8014  
##  Median :   88585   Median :  11854   Median :  72.0   Median : 23313  
##  Mean   :  731429   Mean   : 146753   Mean   : 696.9   Mean   : 35945  
##  3rd Qu.:  362899   3rd Qu.:  43792   3rd Qu.: 504.5   3rd Qu.: 59481  
##  Max.   :23946970   Max.   :6910162   Max.   :8944.0   Max.   :158818  
##  Deaths_1M_pop      TotalTests         Tests_1M_pop       Population       
##  Min.   :   1.0   Min.   :     1331   Min.   :   3682   Min.   :1.109e+04  
##  1st Qu.:  92.5   1st Qu.:   225622   1st Qu.:  93658   1st Qu.:1.808e+06  
##  Median : 331.0   Median :  1377915   Median : 284796   Median :7.199e+06  
##  Mean   : 611.0   Mean   : 11860445   Mean   : 548750   Mean   :3.636e+07  
##  3rd Qu.: 944.5   3rd Qu.:  7881671   3rd Qu.: 672932   3rd Qu.:3.300e+07  
##  Max.   :2523.0   Max.   :409404894   Max.   :4597508   Max.   :1.390e+09  
##   Continent         X1_Caseevery_X_ppl X1_Deathevery_X_ppl X1_Testevery_X_ppl
##  Length:139         Min.   :   6.0     Min.   :   396      Min.   :  0.00    
##  Class :character   1st Qu.:  17.0     1st Qu.:  1058      1st Qu.:  1.50    
##  Mode  :character   Median :  43.0     Median :  3021      Median :  4.00    
##                     Mean   : 220.7     Mean   : 20018      Mean   : 15.94    
##                     3rd Qu.: 126.0     3rd Qu.: 10840      3rd Qu.: 11.00    
##                     Max.   :4932.0     Max.   :736136      Max.   :272.00
#View(corona_data_updated)

corona_data_updated$Continent <- as.factor(corona_data_updated$Continent) 
str(corona_data_updated$Continent)
##  Factor w/ 6 levels "Africa","Asia",..: 5 6 2 4 4 4 4 2 4 4 ...
data1<-corona_data_updated
df <-corona_data_updated
library(forcats)
library(tidyverse)
library(ggplot2)
#Which continent is having the maximum number of cases?
 
 
continent_TotalCases <- corona_data_updated %>% mutate(Continent = fct_lump(Continent, n=5 ))  %>% 
        group_by(Continent) %>% 
  summarise(TotalCases = max(TotalCases, na.rm=TRUE))%>% arrange(desc(TotalCases))  
## `summarise()` ungrouping output (override with `.groups` argument)
continent_TotalCases
## # A tibble: 6 x 2
##   Continent         TotalCases
##   <fct>                  <dbl>
## 1 North America       31425966
## 2 South America       12984956
## 3 Asia                12625146
## 4 Europe               4822470
## 5 Africa               1551964
## 6 Australia/Oceania      18633
# #plot
# continent_TotalCases <- corona_data_updated %>% mutate(Continent = fct_lump(Continent, n=5 ))  %>% 
#         group_by(Continent) %>% 
#   summarise(TotalCases=max(TotalCases, na.rm=TRUE)) %>% 
#   ggplot(aes(x=Continent, y=  TotalCases))+geom_col()
# 
# Which continent has the least number of deaths?
library(dplyr)
continent_leastdeaths <- corona_data_updated %>% mutate(Continent = fct_lump(Continent, n=5 ))   %>%
         group_by(Continent) %>%   dplyr::summarise(TotalDeaths = min(TotalDeaths, na.rm=TRUE)) %>% arrange(TotalDeaths)
## `summarise()` ungrouping output (override with `.groups` argument)
continent_leastdeaths
## # A tibble: 6 x 2
##   Continent         TotalDeaths
##   <fct>                   <dbl>
## 1 Australia/Oceania           4
## 2 North America              10
## 3 Asia                       12
## 4 Europe                     29
## 5 Africa                     66
## 6 South America              93
# What is the current status of China? 
#There no data for China as it removed in the process of removing NAs 
#I have taken 3 different countries for this question

library(dplyr)
# current status of Brazil
current_status_Brazil <- corona_data_updated   %>%
  filter(corona_data_updated$Country_Other == "Brazil")  %>%
         dplyr::summarise(Country_Other, TotalCases=max(TotalCases, na.rm=TRUE), TotalDeaths=max(TotalDeaths, na.rm=TRUE),ActiveCases=max(ActiveCases, na.rm=TRUE),Serious_Critical=max(Serious_Critical, na.rm=TRUE))
current_status_Brazil
##   Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1        Brazil   12984956      331530     1295905             8318
# current status of USA
current_status_USA <- corona_data_updated   %>%
  filter(corona_data_updated$Country_Other == "USA")  %>%
         dplyr::summarise(Country_Other, TotalCases=max(TotalCases, na.rm=TRUE), TotalDeaths=max(TotalDeaths, na.rm=TRUE),ActiveCases=max(ActiveCases, na.rm=TRUE),Serious_Critical=max(Serious_Critical, na.rm=TRUE))
current_status_USA
##   Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1           USA   31425966      568834     6910162             8716
# current status of India
current_status_India <- corona_data_updated   %>%
  filter(corona_data_updated$Country_Other == "India")  %>%
         dplyr::summarise(Country_Other, TotalCases=max(TotalCases, na.rm=TRUE), TotalDeaths=max(TotalDeaths, na.rm=TRUE),ActiveCases=max(ActiveCases, na.rm=TRUE),Serious_Critical=max(Serious_Critical, na.rm=TRUE))
current_status_India
##   Country_Other TotalCases TotalDeaths ActiveCases Serious_Critical
## 1         India   12625146      165293      761196             8944
#************** What is the current status of China? 
#***************** Method-2 using TotalDeaths
#There no data for China as it removed in the process of removing NAs 
#I have taken 3 different countries for this question
#If TotalDeaths is more than the mean of the value the rank allotted  is "1" which indicate its in dangerous situation , if less than mean then rank allotted as "0" which indicates the country comes under safe zone.

library(dplyr)
TotalDeaths_status <- corona_data_updated   %>% dplyr::summarise(TotalDeaths = mean(TotalDeaths, na.rm=TRUE))
#TotalDeaths_status
current_status <- mutate(corona_data_updated, rank = ifelse(corona_data_updated$TotalDeaths >= 13235, "1", "0"))
current_status$rank <- as.factor(current_status$rank)

current_status_country <- current_status %>% filter(current_status$rank== 1)%>% select(Country_Other,TotalCases, TotalDeaths )  


#current_status_country
final_satus <- function(x){
  ifelse(x %in% current_status_country$Country_Other, "The country is in **Dangerous situation**", "The country is Safe")
  }
#status of different countries
final_satus("USA")
## [1] "The country is in **Dangerous situation**"
final_satus("Ireland")
## [1] "The country is Safe"
final_satus("Germany")
## [1] "The country is in **Dangerous situation**"
final_satus("India")
## [1] "The country is in **Dangerous situation**"
final_satus("Egypt")
## [1] "The country is Safe"
# Please arrange all data based on the total number of cases per million population?
 
 cases_per_million_population <- corona_data_updated %>% 
      mutate(corona_data_updated$TotalCases / (corona_data_updated$Population /1000000 )) %>% 
             arrange(desc(TotalCases))

head(cases_per_million_population)
##   S.No. Country_Other TotalCases TotalDeaths TotalRecovered ActiveCases
## 1     1           USA   31425966      568834       23946970     6910162
## 2     2        Brazil   12984956      331530       11357521     1295905
## 3     3         India   12625146      165293       11698657      761196
## 4     4        France    4822470       96678         299624     4426168
## 5     5        Russia    4589540      100717        4211133      277690
## 6     6            UK    4362150      126862        3912562      322726
##   Serious_Critical Tot_Cases_1M_pop Deaths_1M_pop TotalTests Tests_1M_pop
## 1             8716            94522          1711  409404894      1231390
## 2             8318            60761          1551   28600000       133830
## 3             8944             9081           119  249019657       179116
## 4             5341            73757          1479   66728544      1020577
## 5             2300            31439           690  121900000       835035
## 6              517            64002          1861  127546869      1871386
##   Population     Continent X1_Caseevery_X_ppl X1_Deathevery_X_ppl
## 1  332473823 North America                 11                 584
## 2  213704094 South America                 16                 645
## 3 1390271710          Asia                110                8411
## 4   65383138        Europe                 14                 676
## 5  145981988        Europe                 32                1449
## 6   68156365        Europe                 16                 537
##   X1_Testevery_X_ppl
## 1                  1
## 2                  7
## 3                  6
## 4                  1
## 5                  1
## 6                  1
##   corona_data_updated$TotalCases/(corona_data_updated$Population/1e+06)
## 1                                                             94521.625
## 2                                                             60761.382
## 3                                                              9081.064
## 4                                                             73757.090
## 5                                                             31439.084
## 6                                                             64002.093
#View(cases_per_million_population) 
#Which country ranks first based on total number of cases per million population and which country ranks last?


#country ranks first based on total number of cases per million population
cases_per_million_population <- corona_data_updated %>% 
      mutate(corona_data_updated$TotalCases / (corona_data_updated$Population /1000000 )) %>%
  select(Country_Other, TotalCases, Population)%>% arrange(corona_data_updated)

 cases_per_rankONE <- cases_per_million_population %>%
  filter(TotalCases == max(TotalCases) ) %>%
  arrange(Country_Other) %>%
  head(10)
cases_per_rankONE
##   Country_Other TotalCases Population
## 1           USA   31425966  332473823
#country ranks last based on total number of cases per million population
 cases_per_rankLAST <- cases_per_million_population %>%
  filter(TotalCases ==min(TotalCases) ) %>%
  arrange(Country_Other) %>%
  head(10)
cases_per_rankLAST
##       Country_Other TotalCases Population
## 1 Wallis and Futuna        420      11089
corona_new <- current_status %>% select(TotalCases,TotalDeaths,TotalRecovered,rank )
#head(corona_new)

plot(corona_new$TotalCases,corona_new$TotalDeaths , type="b")

#head(corona_new)

boxplot(TotalDeaths ~ rank, corona_new, xlab = "rank", ylab = "TotalDeaths")

hist(corona_new$TotalCases)

with(corona_new, plot(TotalCases, TotalRecovered))

# adding title
with(corona_new,  plot(TotalCases, TotalRecovered))
title(main = "TotalCases  vs TotalRecovered")

# adding colour
with(corona_new,  plot(TotalCases, TotalRecovered, main = "TotalCases  vs TotalRecovered"))
with(subset(corona_new, rank == 1), points(TotalCases, TotalRecovered, col = "blue"))
with(subset(corona_new, rank == 0), points(TotalCases, TotalRecovered, col = "red"))

# Base Plot with Annotation
 
with(corona_new,  plot(TotalCases, TotalRecovered, main = "TotalCases  vs TotalRecovered", type = "n"))
with(subset(corona_new, rank == 1), points(TotalCases, TotalRecovered, col = "blue"))
with(subset(corona_new, rank == 0), points(TotalCases, TotalRecovered, col = "red"))

legend("topleft", pch = 1, col = c("blue", "red"), legend = c("1", "0"))

# Base Plot with Regression Line
with(corona_new,  plot(TotalCases, TotalRecovered, main = "TotalCases  vs TotalRecovered",  pch = 20))
model <- lm(TotalCases ~ TotalRecovered, corona_new)
abline(model, lwd = 2)

# R base scatter plot: plot()
x <- corona_new$TotalCases
y <- corona_new$TotalDeaths
# Plot with main and axis titles
# Change point shape (pch = 19) and remove frame.
plot(x, y, main = "TotalCases  vs TotalRecovered",
     xlab = "TotalCases", ylab = "TotalRecovered",
     pch = 5, frame = FALSE)

# Add regression line
plot(x, y, main = "TotalCases  vs TotalRecovered",
     xlab = "TotalCases", ylab = "TotalRecovered",
     pch = 5, frame = FALSE)
abline(lm(y ~ x, data = corona_new), col = "blue")

# Add loess fit
plot(x, y, main = "TotalCases  vs TotalRecovered",
     xlab = "TotalCases", ylab = "TotalRecovered",
     pch = 5, frame = FALSE)

lines(lowess(x, y), col = "blue")

library(ggplot2)
library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
scatterplot(TotalDeaths  ~ TotalCases, data = corona_new)

# Suppress the smoother and frame

scatterplot(TotalDeaths  ~ TotalCases, data = corona_new,
            grid = TRUE, frame = FALSE)

scatterplot(TotalDeaths  ~ TotalCases, data = corona_new,
            grid = FALSE, frame = FALSE)

# smoother = FALSE, 
# Scatter plot by groups ("cyl")
library(ggplot2)
scatterplot(TotalDeaths  ~ TotalCases, data = corona_new,grid = FALSE, frame = FALSE)

scatterplot(TotalDeaths  ~ TotalCases, data = corona_new,grid = FALSE, frame = TRUE)

# Scatter Plot Matrices - R Base Graphs
# Basic plots:

pairs(corona_new[,1:4], pch = 2)

# Show only upper panel:
pairs(corona_new[,1:4], pch = 19, lower.panel = NULL)

# Color points by groups (species)
my_cols <- c("#00AFBB", "#E7B800")  
pairs(corona_new[,1:4], pch = 2,  cex = 1,
      col = my_cols[iris$Species],
      lower.panel=NULL)

# Basic box plots
# Box plot of one variable
boxplot(corona_new$TotalRecovered)

# Box plots by groups (dose)
# remove frame
boxplot(corona_new$TotalDeaths  ~ corona_new$TotalCases, data = corona_new, frame = FALSE)

# Horizontal box plots
boxplot(TotalDeaths  ~ TotalCases, data = corona_new,frame = FALSE,
        horizontal = TRUE)

# Notched box plots
boxplot(TotalDeaths  ~ TotalCases, data = corona_new, frame = FALSE,
        notch = TRUE)

# Change group names
#boxplot(TotalDeaths  ~ TotalCases, data = corona_new, frame = FALSE, names = c("D0.5", "D1", "D2"))
# Change color
# Change the color of border using one single color
boxplot(TotalDeaths  ~ TotalCases, data = corona_new,  frame = FALSE,
        border = "steelblue")

# Change the color of border.
#  Use different colors for each group
boxplot(TotalDeaths  ~ TotalCases, data = corona_new,frame = FALSE,
        border = c("#999999", "#E69F00", "#56B4E9"))

# Change fill color : single color
boxplot(TotalDeaths  ~ TotalCases, data = corona_new, frame = FALSE,
        col = "steelblue")

# Change group names
#barplot(corona_new$TotalCases, names.arg = c("A", "B", "C"))
# Bar plot of one variable
barplot(corona_new$TotalCases)

# Horizontal bar plot
barplot(corona_new$TotalCases, horiz = TRUE)

# Line Plots - R Base Graphs
plot(x, y, type = "l", lty = 1)
lines(x, y, type = "l", lty = 1)

#### Plotly
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Scatter plot
fig <- plot_ly(data = corona_data_updated, x = ~corona_data_updated$TotalCases, y = ~corona_data_updated$TotalRecovered, marker = list(size = 10,
                             color = 'pink',
                             line = list(color = 'green',
                                         width = 1)))
fig <- fig %>% layout(title = 'Customized Scatter Plot',
                      yaxis = list(zeroline = FALSE),
                      xaxis = list(zeroline = FALSE))
fig
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# head(data1)
fig <- data1 %>%
  plot_ly(
    x = ~data1$Tests_1M_pop,
    y = ~Deaths_1M_pop,
    size = ~Tests_1M_pop,
        frame = ~Continent  ,
    text = ~Country_Other ,
    hoverinfo = "text",
    type = 'scatter',
    mode = 'markers'
  )

fig <- fig %>% layout(
  xaxis = list(
    type = "log"
  )
)

fig
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
#fig <- plot_ly(x = ~data1$TotalCases, y = ~data1$TotalDeaths, z = ~data1$TotalRecovered,data=, type = #Data #Visualisation

fig <- plot_ly(data1, x = ~Tot_Cases_1M_pop, y = ~Continent, name = "TotalCases", 
               type = 'scatter', mode = "markers", 
               marker = list(color = "red", opacity = 0.4), size =5)%>% 
  add_trace(x = ~ActiveCases, y = ~Continent, name = "ActiveCases",
            type  ='scatter',mode = "markers", marker = list(color = "blue", opacity = 0.4)) %>% 
  layout(title = "Total Deaths vs Active vs Recovered cases")

fig
# Data Preparation
labels <- data1$Continent
values <- data1$Deaths_1M_pop
# Data Visualization
fig <- plot_ly(type='pie', labels=labels, values=values, 
               textinfo='label+percent',
               insidetextorientation='radial') %>% layout(title = "Total deaths")
fig
# Data Preparation
labels <- data1$Continent
values <- data1$ActiveCases

# Data Visualization
fig <- plot_ly(type='pie', labels=labels, values=values, 
               textinfo='label+percent',
               insidetextorientation='radial') %>% layout(title = "Total ActiveCases")
fig